#!/usr/bin/env python3
import json
import re

## Globals Go Here

# Input: a single JSON document (presumably a cleaned Wiktionary extract, going
# by the file name) that is loaded into memory in one go below.
infile = '/gscratch/comdata/users/kaylea/taboo/raw_data/wiktionary_clean.json'
# Output: tab-separated file; the header row and the data rows are written by
# later steps of this script.
outfile = '/gscratch/comdata/users/kaylea/taboo/processed_data/narrow_cleanedParsedWikts.tsv'
# Log file for parser problems.
errorFile = '/gscratch/comdata/users/kaylea/taboo/processed_data/step1WiktSorterParserErrors.txt'
# Substrings used further down to drop definitions: a definition containing any
# of these anywhere in its text is excluded from the output.
grammarlist = ['plural', 'intensifier', 'initialism', 'acronym', 'spelling', 'synonym']

# JSON text is UTF-8, so decode it explicitly instead of relying on the
# locale's default encoding (which can differ between login and compute nodes).
with open(infile, 'r', encoding='utf-8') as jFH: #read in a wikt.json file
	jData = json.load(jFH) #takes a bit


#print('"Definition"')

old = ''  # not referenced anywhere else in the visible part of this script

# Start the output TSV from scratch ('w' truncates any earlier run) so that it
# contains only the column-header row before data rows are appended.
headerColumns = ['dict_word', 'dict_definition', 'dict_tag', 'dict_taboo']
with open(outfile, 'w') as headerFH:
	headerFH.write('\t'.join(headerColumns) + '\n')

# Tags that mark a sense as merely documenting another word (e.g. past tenses,
# alternative forms, synonym pointers). This must be a tuple of separate
# strings: `tag in 'form-of, past, 3, alt-of, synonym'` against one long string
# is a substring test, so fragments such as 'form' or 'alt' would match too.
skipTags = ('form-of', 'past', '3', 'alt-of', 'synonym')

# Joined definitions that are placeholder text rather than real glosses.
skipDefinitions = ('en', 'en.', 'synonym of en', 'synonym of en.', 'initialism of en')

# Pattern a tag must match for its sense to be flagged taboo.
#figurative and idiomatic *not* included
#broader alternative: "slur|rude|impolite|derogatory|pejorat|offensi|taboo|vulg|euph|slang|explet|humor|4chan|abuse|abusive|^cant$|thieves' cant|a mild oath"
# TODO: why is 'rare' getting picked up? are definitions leaking into tag? grep for abuse and see this happen :/
tabooPattern = re.compile(r"euph")

# Both files are opened once for the whole run. The output is opened in append
# mode because the header row has already been written to it; UTF-8 is explicit
# so non-ASCII definitions cannot fail to encode and be dropped silently.
with open(errorFile, 'w', encoding='utf-8') as err, open(outfile, 'a', encoding='utf-8') as outf:
	for item in jData:
		# Skip all redirects since we parse the population. Entries with no
		# 'redirect' key (or that are not dicts at all) simply fall through.
		try:
			if item['redirect']:
				continue
		except (KeyError, TypeError):
			pass #not a redir so on we go

		# Entries with no language field are echoed to stdout and skipped.
		try:
			lang = item['lang']
		except (KeyError, TypeError):
			print(item)
			continue
		# NOTE(review): non-English entries are reported but still processed;
		# confirm whether they should be skipped instead.
		if lang != 'English':
			print(lang)

		senses = item.get('senses')
		if not isinstance(senses, list):
			continue #nothing to parse for this entry
		if 'word' not in item:
			err.write(f"Skipping entry without a 'word' field: {item}\n")
			continue
		word = item['word']

		for sense in senses:
			# NOTE(review): a sense lacking 'tags' or 'glosses' is skipped, as
			# before; confirm that dropping untagged senses is intended.
			try:
				tags = sense['tags']
				glosses = sense['glosses']
			except (KeyError, TypeError):
				continue

			try:
				if any(tag in skipTags for tag in tags):
					continue #goes to the next sense
				definition = '.'.join(glosses) #join defs together
				# Each sense is taboo or not on its own: True as soon as any
				# one of its tags matches the pattern.
				taboo = any(tabooPattern.search(tag) for tag in tags)
			except TypeError as exc:
				# Tags/glosses that are not lists of strings used to vanish
				# silently; record them so they can be inspected.
				err.write(f"Skipping malformed sense of {word!r}: {sense} ({exc})\n")
				continue

			# Exclude blank definitions: '', '.', '..' ... all come from joining
			# empty glosses. Also exclude the known placeholder texts.
			if not definition.strip('.') or definition in skipDefinitions:
				continue
			#exclude the documentation of these types of words, they seem to go untagged :/
			if any(ele in definition for ele in grammarlist):
				continue

			# NOTE(review): definitions are written verbatim; a tab or newline
			# inside a gloss would break the TSV layout - verify the input.
			outf.write(f"{word}\t{definition}\t{tags}\t{taboo}\n")

#definition = definition.replace('"', '') 
#with open('tagfile.csv', 'r') as tagFH: #read in a file of tags

#output just items with that tag. format: word, triggering keyword, gloss
